library(jpeg)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.99.9000
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(knitr)
library(janitor)
library(corrgram)
library(rpart)
img_url <- c("http://curso-r.com/img/blog/desafio-recuperacao-img/purple_wave.jpg",
"http://curso-r.com/img/blog/desafio-recuperacao-img/xadrez_colorido.jpg")
img_name <- basename(img_url)
walk2(img_url,img_name,function(x,y){
download.file(url = x,
destfile = y)
})
images <- map(img_name,readJPEG) %>%
set_names(janitor::clean_names(img_name))
map(images,dim)
## $purple_wave.jpg
## [1] 210 336 3
##
## $xadrez_colorido.jpg
## [1] 161 180 3
# Function to generate dataframe from images
generate_df_from_jpeg <- function(img){
data.frame(
x = rep(1:dim(img)[2], each = dim(img)[1]),
y = rep(dim(img)[1]:1, dim(img)[2]),
r = as.vector(img[,,1]),
g = as.vector(img[,,2]),
b = as.vector(img[,,3])
) %>%
mutate(cor = rgb(r,g,b),
id = row_number())
}
images_df <- map(images, generate_df_from_jpeg)
# Function to split the dataset into train and test
split_train_test <- function(img) {
ret <- list()
ret$test <- img %>%
sample_frac(2/5) %>%
mutate(b_backup = b,
b = 0,
cor = rgb(r, g, b))
ret$train <- img %>%
filter(!id%in%ret[1]$id)
ret
}
images_train_test_df <- map(images_df,split_train_test)
map(images_df, corrgram::corrgram)
## $purple_wave.jpg
## x y r g b
## x 1.00000000 0.000000000 -0.06307728 -0.05403876 -0.06139958
## y 0.00000000 1.000000000 0.78886627 0.72207882 0.77601361
## r -0.06307728 0.788866273 1.00000000 0.97757782 0.99884057
## g -0.05403876 0.722078823 0.97757782 1.00000000 0.98506976
## b -0.06139958 0.776013608 0.99884057 0.98506976 1.00000000
## id 0.99999557 -0.002976157 -0.06542479 -0.05618754 -0.06370885
## id
## x 0.999995571
## y -0.002976157
## r -0.065424789
## g -0.056187540
## b -0.063708846
## id 1.000000000
##
## $xadrez_colorido.jpg
## x y r g b
## x 1.00000000 0.000000000 0.07117786 0.01943599 0.070323647
## y 0.00000000 1.000000000 -0.00145933 -0.00198190 -0.003784754
## r 0.07117786 -0.001459330 1.00000000 0.47843676 0.147598374
## g 0.01943599 -0.001981900 0.47843676 1.00000000 -0.056291052
## b 0.07032365 -0.003784754 0.14759837 -0.05629105 1.000000000
## id 0.99998457 -0.005555448 0.07118487 0.01944670 0.070343588
## id
## x 0.999984568
## y -0.005555448
## r 0.071184871
## g 0.019446701
## b 0.070343588
## id 1.000000000
The first JPEG, purple_wave.jpeg has a very high correlation between the colours and the axis x-y, what suggests a linear relationship. Otherwise, the second JPEG, xadrez_colorido.jpg doesn’t have that relatioship.
Something to worry about is the multicolinearity on the linear regression, as long as it seems that the independent values are highly correlated.
images_lm <- map(images_train_test_df, ~ lm(formula = b ~ x + y + r + g, data = .$train))
images_cart <- map(images_train_test_df, ~ rpart(formula = b ~ x + y + r + g, data = .$train))
# Function to calculate the prediction
predict_images <- function(x,y){
predict(x,y$test)
}
images_lm_predictions <- map2(images_lm,images_train_test_df, predict_images)
images_cart_predictions <- map2(images_cart, images_train_test_df, predict_images)
Evaluate the linear regression
map(images_lm,summary)
## $purple_wave.jpg
##
## Call:
## lm(formula = b ~ x + y + r + g, data = .$train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.040405 -0.002844 -0.000702 0.003174 0.039471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.814e-04 7.169e-05 9.504 < 2e-16 ***
## x 1.642e-06 2.775e-07 5.919 3.26e-09 ***
## y -3.511e-05 7.787e-07 -45.092 < 2e-16 ***
## r 8.191e-01 5.426e-04 1509.611 < 2e-16 ***
## g 3.127e-01 8.115e-04 385.289 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007092 on 70555 degrees of freedom
## Multiple R-squared: 0.9994, Adjusted R-squared: 0.9994
## F-statistic: 2.835e+07 on 4 and 70555 DF, p-value: < 2.2e-16
##
##
## $xadrez_colorido.jpg
##
## Call:
## lm(formula = b ~ x + y + r + g, data = .$train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32229 -0.15811 -0.12039 -0.02142 0.93542
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.106e-01 5.096e-03 21.70 <2e-16 ***
## x 3.520e-04 3.508e-05 10.03 <2e-16 ***
## y -2.581e-05 3.912e-05 -0.66 0.509
## r 1.639e-01 4.844e-03 33.84 <2e-16 ***
## g -1.447e-01 5.784e-03 -25.02 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3095 on 28975 degrees of freedom
## Multiple R-squared: 0.046, Adjusted R-squared: 0.04587
## F-statistic: 349.3 on 4 and 28975 DF, p-value: < 2.2e-16
The result of the linear regression for purple_wave.jpg seems very good. We got a Adjusted R-squared of 0.9994. On the other hand, for xadrez_colorido.jpg the result is not so good.
# Function to calcualte the Mean Absolute Error
MAE <- function(actual, predicted){
mean(abs(actual$test$b - predicted))
}
map2(images_train_test_df,images_lm_predictions,MAE)
## $purple_wave.jpg
## [1] 0.2832589
##
## $xadrez_colorido.jpg
## [1] 0.1717388
Considering that the values are from 0 to 1, the regression tree has a mean difference of 0.17 for xadrez_colorido.jpg, which is way better than the result for purple_wave.jpg
images_train_test_df$purple_wave.jpg$test$b_lm <- images_lm_predictions$purple_wave.jpg
images_train_test_df$purple_wave.jpg$test$b_cart <- images_cart_predictions$purple_wave.jpg
images_train_test_df$xadrez_colorido.jpg$test$b_lm <- images_lm_predictions$xadrez_colorido.jpg
images_train_test_df$xadrez_colorido.jpg$test$b_cart <- images_cart_predictions$xadrez_colorido.jpg
# Function to generate the plot using tidyeval
generate_plot <- function(x, var_name){
var_name <- enquo(var_name)
# The linear model has predicted some values smaller than 0, which is not accepted by rgb function.
df <- x$test %>%
mutate(!!var_name := if_else(!!var_name < 0, 0 , !!var_name),
cor = rgb(r,g,!!var_name))
# Generating the plot
ggplot(df) +
geom_point(aes(x,y),
colour = df$cor)
}
map(images_train_test_df,generate_plot,var_name = b_lm)
## $purple_wave.jpg
##
## $xadrez_colorido.jpg
Using the predicted values by the Linear Regression, the purple wave was satisfactorily recovered. Whilst for “xadrez colorido”, the linear model seems to have failed in predict the colour correctly. It’s not possible to see any tone of blue on the image.
map(images_train_test_df,generate_plot,var_name = b_cart)
## $purple_wave.jpg
##
## $xadrez_colorido.jpg
Regarding “purple wave”, the values predicted by the CART are also visually good on the prediction but the colours are not smooth as the results from the Linear Regression. On the other hand, the CART model predicted satisfactorily the colour tones for “xadrez colorigo”.
Due to non linear relationship between independent and dependent variables, the image “xadrez colorido” has a better prediction result with the regression tree model.